********************************************************************************
*DYNAMIC IMPACTS OF PRICING GROUNDWATER
*Bruno, Jessoe, Hanemann in JAERE
*
*EXTRACTION DATA PRE-PROCESSING CODE: 
*Takes raw recycled water data and converts to Stata long format.
*Takes raw groundwater production data and converts to Stata long format. 
*Generates year_run variable and parcel to well crosswalk files.
*Aggregates to parcel and well levels and saves for use in parcel_clean.do and well_clean.do.
********************************************************************************
********************************************************************************
clear all
capture log close
set more off

*SELECT OUTPUT DATE
global outputdate = "20230615"		 

*SET DIRECTORY
cd  "D:\Ellen\Dropbox\Pajaro_AgInnovation" 

********************************************************************************
*REFORMAT RAW RECYCLED/DELIVERED WATER DATA
********************************************************************************
*INPUT RAW DELIVERED WATER DATA
clear
import excel "Data\Raw_Data\PV_Water_Metered_Wells_Turnouts_Production_Quarterly_Annual_AF_through_2020.xlsx", sheet("Turnouts_SubAnnual_Prod_AF") firstrow

*PARSE OUT BY DATE TO REARRANGE
local myyear 2003 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
local myquarter 1 2 3 4 

foreach year of local myyear {
	foreach quarter of local myquarter {
preserve
keep x`year'_Q`quarter' SiteName
gen year = `year'
gen quarter = `quarter'
rename x`year'_Q`quarter' delivered
label variable delivered "recycled water deliveries"
rename SiteName turnout

save "Data\Deliveries_by_Date\Recycled_`year'_Q`quarter'_$outputdate.dta", replace
restore
}
}

*PARSE OUT BY DATE TO REARRANGE
local myyear 2004
local myquarter 1 2 3 

foreach year of local myyear {
	foreach quarter of local myquarter {
preserve
keep x`year'_Q`quarter' SiteName
gen year = `year'
gen quarter = `quarter'
rename x`year'_Q`quarter' delivered
label variable delivered "recycled water deliveries"
rename SiteName turnout

save "Data\Deliveries_by_Date\Recycled_`year'_Q`quarter'_$outputdate.dta", replace
restore
}
}

*APPEND TO EACH OTHER
clear
use "Data\Deliveries_by_Date\Recycled_2004_Q1_$outputdate.dta"
append using "Data\Deliveries_by_Date\Recycled_2004_Q2_$outputdate.dta"
append using "Data\Deliveries_by_Date\Recycled_2004_Q3_$outputdate.dta"


local myyear 2003 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
local myquarter 1 2 3 4 

foreach year of local myyear {
	foreach quarter of local myquarter {
append using "Data\Deliveries_by_Date\Recycled_`year'_Q`quarter'_$outputdate.dta"
}
}

*SAVE DELIVERED WATER DATA IN STATA LONG FORMAT
save "Data\Delivered_water_byturnout_$outputdate.dta", replace

collapse (sum) delivered, by (year quarter)

*ADD IN ZEROS FOR OUTSIDE DELIVERED WATER ZONE
gen inside =1 
append using "Data\Delivered_water_outside_20210601.dta"

*SAVE DELIVERED WATER DATA IN STATA LONG FORMAT

save "Data\Delivered_water_aggregate_$outputdate.dta", replace

********************************************************************************

*GENERATE YEAR_RUN VARIABLE
drop if year <2005
drop if year==2005 & quarter!=4
sort year quarter
egen quarter_running=group(year quarter)

gen year_run=.
replace year_run=1 if quarter_running<5
replace year_run=2 if quarter_running>4 & quarter_running<9
replace year_run=3 if quarter_running>8 & quarter_running<13
replace year_run=4 if quarter_running>12 & quarter_running<17
replace year_run=5 if quarter_running>16 & quarter_running<21
replace year_run=6 if quarter_running>20 & quarter_running<25
replace year_run=7 if quarter_running>24 & quarter_running<29
replace year_run=8 if quarter_running>28 & quarter_running<33
replace year_run=9 if quarter_running>32 & quarter_running<37
replace year_run=10 if quarter_running>36 & quarter_running<41
replace year_run=11 if quarter_running>40 & quarter_running<45
replace year_run=12 if quarter_running>44 & quarter_running<49
replace year_run=13 if quarter_running>48 & quarter_running<53
replace year_run=14 if quarter_running>52 & quarter_running<57
replace year_run=15 if quarter_running>56 & quarter_running<61
drop if year_run==.

collapse (sum) delivered, by (year_run inside)
replace year_run =2006 if year_run==1
replace year_run =2007 if year_run==2
replace year_run =2008 if year_run==3
replace year_run =2009 if year_run==4
replace year_run =2010 if year_run==5
replace year_run =2011 if year_run==6
replace year_run =2012 if year_run==7
replace year_run =2013 if year_run==8
replace year_run =2014 if year_run==9
replace year_run =2015 if year_run==10
replace year_run =2016 if year_run==11
replace year_run =2017 if year_run==12
replace year_run =2018 if year_run==13
replace year_run =2019 if year_run==14
replace year_run =2020 if year_run==15
rename year_run year

sort inside year
gen delivered_lag = delivered[_n-1]
replace delivered_lag =. if delivered_lag==0 & inside==1
save "Data\Delivered_water_yearrun_$outputdate.dta", replace

********************************************************************************
*REFORMAT RAW EXTRACTION DATA
********************************************************************************

*INPUT RAW EXTRACTION DATA
clear
import excel "Data\Raw_Data\PV_Water_Metered_Wells_Turnouts_Production_Quarterly_Annual_AF_through_2020.xlsx", sheet("MeteredWells_SubAnnual_Prod_AF") firstrow

*PARSE OUT BY DATE TO REARRANGE
local myyear 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
local myquarter 1 2 3 4 

foreach year of local myyear {
	foreach quarter of local myquarter {
preserve
keep x`year'_Q`quarter' SiteNumber
gen year = `year'
gen quarter =`quarter'
rename x`year'_Q`quarter' extraction
label variable extraction "extraction"
rename SiteNumber sitenum

save "Data\Extraction_by_date\Extract_`year'_Q`quarter'_$outputdate.dta", replace
outsheet sitenum quarter year extraction using D:\Ellen\Dropbox\Pajaro_AgInnovation\Data\Extraction_by_date\Extract_`year'_Q`quarter'_$outputdate.csv, comma replace 
restore
}
}

*APPEND TO EACH OTHER
clear
use "Data\Extraction_by_date\Extract_2003_Q1_$outputdate.dta"
append using "Data\Extraction_by_date\Extract_2003_Q2_$outputdate.dta"
append using "Data\Extraction_by_date\Extract_2003_Q3_$outputdate.dta"
append using "Data\Extraction_by_date\Extract_2003_Q4_$outputdate.dta"

local myyear 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
local myquarter 1 2 3 4 

foreach year of local myyear {
	foreach quarter of local myquarter {
append using "Data\Extraction_by_date\Extract_`year'_Q`quarter'_$outputdate.dta"
}
}
drop if extraction ==.
drop if sitenum ==0

*SAVE EXTRACTION DATA IN STATA LONG FORMAT
save "Data\Extraction_$outputdate.dta", replace

********************************************************************************
*GENERATE PARCEL-TO-WELL AND DWZ-TO-WELL CROSSWALK DATASETS
********************************************************************************

clear
*INPUT FILE THAT MATCHES WELLS TO PARCELS (OUTPUT FROM ARCPY)
import excel "Data\parcel_wells_20210520.xls", sheet("parcel_wells_20210520") firstrow
rename Well_Num sitenum
destring APN, force gen(apn)

drop OBJECTID JOIN_FID ID COUNTYID PERIMETER PVWMA USE WELL LAT LONG X_ft Y_ft Shape_Length Shape_Area AREA_GEO CENTROID NEAR_FID NEAR_DIST
drop if sitenum==.

*How many sitenums in dataset? 
by sitenum, sort: gen nvals_sitenum = _n == 1 
count if nvals_sitenum

*How many APN's in dataset? 
by apn, sort: gen nvals_apn = _n == 1 
count if nvals_apn

by TARGET_FID, sort: gen nvals_targetFID = _n ==1
count if nvals_targetFID

*drops parcel if it has no well on it
drop if Join_Count<1 

*TARGET FID=parcelnum (EQUIVALENT TO APN)
rename TARGET parcelnum
drop APN nvals* Join

*CREATES SITENUM TO PARCELNUM CROSSWALK FILE
save "Data\parcel_wells_$outputdate.dta", replace


*INPUT FILE THAT MATCHES WELLS TO DWZ (OUTPUT FROM ARCPY)
clear
import excel "Data\inside_wells_20210526.xls", sheet("inside_wells_20210526") firstrow
*Count number of join (number of wells Inside DWZ) *141/990 = 14.24% inside 
rename Well_Num sitenum
count if Join_Count
keep sitenum Join_Count
rename Join_Count inside
merge 1:1 sitenum using "Data\sitenumlist.dta"
replace inside =0 if inside ==.
drop _merge

*CREATES SITENUM TO DWZ CROSSWALK FILE
save "Data\inside_wells_$outputdate.dta", replace

********************************************************************************
*MERGE ALL TOGETHER AND AGGREGATE BY PARCEL
********************************************************************************
clear
use "Data\Extraction_$outputdate.dta", replace
merge m:1 sitenum using "Data\parcel_wells_$outputdate.dta"
drop _merge
merge m:1 sitenum using "Data\inside_wells_$outputdate.dta"
drop _merge
merge m:1 sitenum year using "Data\Pajaro_depth_20210526.dta"
drop _merge
merge m:1 year quarter using "Data\fiscal_year.dta"
drop _merge
drop if parcelnum==.
drop if year==.
egen county_code= group(COUNTY)
merge m:1 year county_code using "Data\Land_values\cropland_value_20210526.dta"
drop _merge

egen apn_count = group(apn)
egen parcel_count = group(parcelnum)
egen well_count = group(sitenum)
sum parcel_count well_count apn_count

*COLLAPSE AND SAVE BY PARCELNUM-DATE (PARCEL LEVEL)
preserve
collapse (sum) extraction (mean) gw_depth AREA (min) inside (mean) county_code rent_irrigated, by (parcelnum year quarter fiscal_year)
sum extraction
gen area_acres = AREA/43560
save "Data\parcel_extraction_$outputdate.dta", replace
by parcelnum, sort: gen nvals_parcel = _n == 1 
count if nvals_parcel
restore


*COLLAPSE AND SAVE BY SITENUM-DATE (WELL LEVEL)
preserve
collapse (sum) extraction (mean) gw_depth AREA (min) inside (mean) county_code rent_irrigated, by (sitenum parcelnum year quarter fiscal_year)
sum extraction
gen area_acres = AREA/43560
save "Data\well_extraction_$outputdate.dta", replace
by parcelnum, sort: gen nvals_parcel = _n == 1 
count if nvals_parcel
by sitenum, sort: gen nvals_wells = _n == 1 
count if nvals_wells
restore
